Topics:
**if time
Note: We’re skipping string manipulation
## [1] "Last knit to html at: 2015-09-24 11:47:10"
## Here is some of my unpublished data -- if you publish it, please include me as an author!
## but seriously, please don't share.
# flowerFile <- file.choose()
flowerFile <- "/Users/callinswitzer/Dropbox/Harvard/RByCall/RClassMaterials/Session2/DataSets/CallinPollen.csv"
flr <- read.csv(flowerFile)
flr[ 1:5 , 1:4] # shows the first 5 rows, first 4 columns
## flowerNum trt daysOpen distance
## 1 14 Ambient 4 3
## 2 14 Ambient 4 3
## 3 15 Ambient 3 4
## 4 15 Ambient 3 4
## 5 16 Ambient 2 4
Session -> Set Working Directory -> Choose Directory
# gets your working directory
getwd()
## [1] "/Users/callinswitzer/Dropbox/Harvard/RByCall/rclassmaterials/Session2"
# sets your working directory
setwd("/Users/callinswitzer/Dropbox/Harvard/rbycall/RClassMaterials/Session2")
dir() # prints stuff in your working directory
file.show() # allows you to open files
You can click “Import Dataset”, or write the code yourself
# read.table is another (possibly more flexible) way to read in data
# get names of files in datasets folder
dir("datasets")
## [1] "arrests.txt" "CallinPollen.csv" "loblolly.txt"
## [4] "mtcars.txt" "spray-easy.txt" "spray.txt"
## [7] "toothGrowth.txt" "yarn.txt"
# view the file
file.show("datasets/mtcars.txt")
# read in space-separated file
crs <- read.table("datasets/mtcars.txt", sep = " ")
head(crs)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
## you can also import directly from a web URL
# this is a dataset about rats from crawley book
URL <- "http://nature.berkeley.edu/~casterln/crawley/rats.txt"
rats <- read.csv(URL)
head(rats) # not quite right
## Glycogen.Treatment.Rat.Liver
## 1 131\t1\t1\t1
## 2 130\t1\t1\t1
## 3 131\t1\t1\t2
## 4 125\t1\t1\t2
## 5 136\t1\t1\t3
## 6 142\t1\t1\t3
rats <- read.csv(URL, sep = "\t")
head(rats)
## Glycogen Treatment Rat Liver
## 1 131 1 1 1
## 2 130 1 1 1
## 3 131 1 1 2
## 4 125 1 1 2
## 5 136 1 1 3
## 6 142 1 1 3
Exercises(1):
file.show("datasets/mtcars.txt")
# looks like a " " is being used there
crs <- read.csv("datasets/mtcars.txt", sep = " ")
head(crs)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# answer
# read in data
gl <- read.table("datasets/spray.txt", stringsAsFactors = FALSE)
# get rid of header
glsm <- gl[2:nrow(gl), 2]
# make it a data frame
df1 <- data.frame(t(data.frame(strsplit(glsm, "GoodLuck\\!"))))
# clean up row names
row.names(df1) <- 1:nrow(df1)
# get rid of extra column
df1 <- df1[, 2:5]
# replace column names
nms <- unlist(strsplit(paste(gl[1,1], gl[1,2], split = ""), split = "GoodLuck\\!"))
nms <- gsub(x = nms, pattern = "[^[:alnum:]]", replacement = "")
names(df1) <- nms
# check our work....close enough :)
df1
## decrease rowpos colpos treatment
## 1 57 1 1 "D"
## 2 95 2 1 "E"
## 3 8 3 1 "B"
## 4 69 4 1 "H"
## 5 92 5 1 "G"
## 6 90 6 1 "F"
## 7 15 7 1 "C"
## 8 2 8 1 "A"
## 9 84 1 2 "C"
## 10 6 2 2 "B"
## 11 127 3 2 "H"
## 12 36 4 2 "D"
## 13 51 5 2 "E"
## 14 2 6 2 "A"
## 15 69 7 2 "F"
## 16 71 8 2 "G"
## 17 87 1 3 "F"
## 18 72 2 3 "H"
## 19 5 3 3 "A"
## 20 39 4 3 "E"
## 21 22 5 3 "D"
## 22 16 6 3 "C"
## 23 72 7 3 "G"
## 24 4 8 3 "B"
## 25 130 1 4 "H"
## 26 4 2 4 "A"
## 27 114 3 4 "E"
## 28 9 4 4 "C"
## 29 20 5 4 "F"
## 30 24 6 4 "G"
## 31 10 7 4 "B"
## 32 51 8 4 "D"
## 33 43 1 5 "E"
## 34 28 2 5 "D"
## 35 60 3 5 "G"
## 36 5 4 5 "A"
## 37 17 5 5 "C"
## 38 7 6 5 "B"
## 39 81 7 5 "H"
## 40 71 8 5 "F"
## 41 12 1 6 "A"
## 42 29 2 6 "C"
## 43 44 3 6 "F"
## 44 77 4 6 "G"
## 45 4 5 6 "B"
## 46 27 6 6 "D"
## 47 47 7 6 "E"
## 48 76 8 6 "H"
## 49 8 1 7 "B"
## 50 72 2 7 "G"
## 51 13 3 7 "C"
## 52 57 4 7 "F"
## 53 4 5 7 "A"
## 54 81 6 7 "H"
## 55 20 7 7 "D"
## 56 61 8 7 "E"
## 57 80 1 8 "G"
## 58 114 2 8 "F"
## 59 39 3 8 "D"
## 60 14 4 8 "B"
## 61 86 5 8 "H"
## 62 55 6 8 "E"
## 63 3 7 8 "A"
## 64 19 8 8 "C"
# we can use the base plot() function
?plot # gets help about the plot function
# we're using the flr dataset
# different ways to plot
par(mfrow = c(2,2)) # tells R to put 2 X 2 plots per panel
plot(x = flr$humidity, y=flr$temp)
plot(temp~humidity, data = flr)
with(flr, plot(y = temp, x = humidity))
with(flr, plot(y = temp, x = humidity,
main = "Temp and Humidity", # title
ylab = expression(paste("Temperature (",degree,"C)")), # y label
xlab = "Rel. Humidity (%)" , # x label
bty = "l", # type of box around the plot
pch = 20, # what symbol to plot: "point change"?
col = rgb(0,0.5,1, 0.5), # color of the poings, and transparency
cex = 2, # size of points
tck = .03, # length of ticks
las = 1, # axis label direction
ylim = c(10, 40), # y limits
xlim = c(20, 100) # x limits
))
par(mfrow = c(1,1)) # reset the plots per panel
# ?par will show you all of the parameters you can adjust
# can also use dev.off() to clear the plotting device
# histograms help us look at distributions and check for outliers
hist(flr$humidity, breaks = 15)
rug(jitter(flr$humidity)) # jitter() helps you see overlapping points
hist(flr$slope); rug(flr$slope)
# a look ahead
histRug <- function(vec, ...){
hist(vec, ...)
rug(vec)
}
histRug(flr$int, breaks = 15, xlab = "int", main = NULL)
# look at scatterplot matrix of numeric data to view "marginal relationships"
colnames(flr) # figure out which columns I want
## [1] "flowerNum" "trt"
## [3] "daysOpen" "distance"
## [5] "shakeNumber" "temp"
## [7] "humidity" "notes"
## [9] "dropFromAnalysis" "file"
## [11] "pxToMmConversion" "firstPollen"
## [13] "firstPollenOutOfScreen" "blankFrames"
## [15] "slope" "int"
## [17] "aperture" "focus"
## [19] "frameRate" "shutter"
## [21] "amplitude" "freq"
## [23] "sec" "plant"
## [25] "openTime" "DateOfExp"
#install.packages("car")
car::scatterplotMatrix(flr[c("humidity", "temp", "slope", "int")])
# change the paramters for the scatterplots
car::scatterplotMatrix(flr[c("humidity", "temp", "slope", "int")],
pch = 20,
col = c("blue", "red", rgb(.8,0.4, .3, 0.2)))
# Another way:
# library(car)
# scatterplotMatrix(flr[c("humidity", "temp", "slope", "int")])
Exercises(3):
file.show('datasets/arrests.txt')
arrests <- read.table("datasets/arrests.txt", sep = ";")
hist(arrests$UrbanPop); rug(arrests$UrbanPop)
car::scatterplotMatrix(arrests)
plot(mpg~wt, data = crs)
plot(mpg~as.factor(cyl), crs)
# or this
boxplot(mpg ~ cyl, data = crs)
plot(mpg~wt, col = as.numeric(as.factor(crs$cyl)), pch = 17, data = crs)
plot(mpg~wt, col = as.numeric(as.factor(crs$cyl)), pch = 17, data = crs)
legend("topright", legend = paste(levels(as.factor(crs$cyl)), "cylinders"), col = levels(as.factor(as.numeric(as.factor(crs$cyl)))) , pch = 17)
# select rows, all columns
FewRows <- flr[10:20, ]
FewRows
## flowerNum trt daysOpen distance shakeNumber temp humidity
## 10 19 Ambient 3 4 1 23.8 54
## 11 19 Ambient 3 4 2 23.8 54
## 12 20 Ambient 4 6 1 23.8 54
## 13 20 Ambient 4 6 2 23.8 54
## 14 21 Ambient 1 1 1 23.7 55
## 15 21 Ambient 1 1 2 23.7 55
## 16 22 Ambient 1 7 1 24.1 55
## 17 22 Ambient 1 7 2 24.1 55
## 18 23 Ambient 3 1 1 24.2 55
## 19 23 Ambient 3 1 2 24.2 55
## 20 24 Ambient 3 1 1 24.3 55
## notes dropFromAnalysis file pxToMmConversion
## 10 <NA> 019_A_3_04_01 06152014 33.71517
## 11 <NA> 019_A_3_04_02 06152014 33.61158
## 12 <NA> 020_A_4_06_01 06152014 33.58819
## 13 <NA> 020_A_4_06_02 06152014 33.78510
## 14 <NA> 021_A_1_01_01 06152014 33.26512
## 15 <NA> 021_A_1_01_02 06152014 33.56558
## 16 <NA> 022_A_1_07_01 06152014 33.61335
## 17 <NA> 022_A_1_07_02 06152014 33.70947
## 18 petals forward 023_A_3_01_01 06152014 33.53424
## 19 petals forward 023_A_3_01_02 06152014 33.44562
## 20 <NA> 024_A_3_01_01 06152014 33.27383
## firstPollen firstPollenOutOfScreen blankFrames slope int
## 10 136 162 348 0.016562908 0.08087007
## 11 42 96 197 0.007731726 0.07813493
## 12 28 106 312 0.002327221 0.03758674
## 13 53 136 231 0.003886397 0.04457205
## 14 34 74 204 0.018009595 0.13009235
## 15 95 151 261 0.011569053 0.08953817
## 16 143 178 378 0.006574263 0.03176503
## 17 44 87 181 0.003088205 0.01444995
## 18 110 165 326 0.008866537 0.05961880
## 19 68 180 302 0.001279032 0.02574620
## 20 32 88 171 0.023082250 0.20553035
## aperture focus frameRate shutter amplitude freq sec plant
## 10 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 11 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 12 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 13 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 14 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 15 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 16 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 17 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 18 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 19 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 20 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## openTime DateOfExp
## 10 <NA>
## 11 <NA>
## 12 <NA>
## 13 <NA>
## 14 <NA>
## 15 <NA>
## 16 <NA>
## 17 <NA>
## 18 <NA>
## 19 <NA>
## 20 <NA>
# use logical statements to return logical vectors
amb <- flr$trt == "Ambient" # returns a logical vector that is TRUE when flr$trt is ambient
head(amb)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
summary(amb)
## Mode FALSE TRUE NA's
## logical 244 198 0
# Use "&" to refine
ambHighHum <- flr$trt == "Ambient" & flr$humidity > 50
summary(ambHighHum)
## Mode FALSE TRUE NA's
## logical 278 164 0
# Note: can't use x == NA; us is.na() instead
head(flr$openTime == NA) # doesn't work
## [1] NA NA NA NA NA NA
naOpen <- is.na(flr$openTime)
naOpen
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [45] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [56] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [67] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [78] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [89] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [100] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [111] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [122] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [144] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [166] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [199] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [210] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [232] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [243] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [254] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [276] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [287] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [298] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [309] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [320] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [331] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [342] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [353] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [364] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [375] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [386] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [408] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [419] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [430] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [441] FALSE FALSE
Use square bracket notation
# select rows where daysOpen == 1, all columns
d1 <- flr[flr$daysOpen == 1, ]
summary(d1)
## flowerNum trt daysOpen distance
## Min. : 17.0 Ambient :46 Min. :1 Min. : 1.000
## 1st Qu.: 34.0 Humidifier : 2 1st Qu.:1 1st Qu.: 2.000
## Median : 46.0 AC : 1 Median :1 Median : 5.000
## Mean : 64.9 AC and dehumidifier : 0 Mean :1 Mean : 4.641
## 3rd Qu.: 73.0 AC set to 62 overnight: 0 3rd Qu.:1 3rd Qu.: 7.000
## Max. :163.0 dehumidifier : 0 Max. :1 Max. :10.000
## (Other) : 0 NA's :10
## shakeNumber temp humidity
## Min. :1.000 Min. :18.80 Min. :50.0
## 1st Qu.:1.000 1st Qu.:24.10 1st Qu.:53.0
## Median :1.000 Median :24.50 Median :55.0
## Mean :1.408 Mean :24.39 Mean :58.9
## 3rd Qu.:2.000 3rd Qu.:24.90 3rd Qu.:55.0
## Max. :2.000 Max. :25.30 Max. :83.0
##
## notes dropFromAnalysis
## petals forward : 4 :47
## two petals stuck together : 2 y: 2
## Anther deformed : 1
## Anther not fit snugly in shaker: 1
## Open on same day as experiments: 1
## (Other) : 0
## NA's :40
## file pxToMmConversion firstPollen
## 017_A_1_01_02 06152014: 1 Min. :32.16 Min. : 1.0
## 021_A_1_01_01 06152014: 1 1st Qu.:33.23 1st Qu.: 63.0
## 021_A_1_01_02 06152014: 1 Median :33.38 Median : 90.0
## 022_A_1_07_01 06152014: 1 Mean :33.37 Mean :105.9
## 022_A_1_07_02 06152014: 1 3rd Qu.:33.63 3rd Qu.:123.0
## 025_A_1_02_01 06152014: 1 Max. :33.89 Max. :438.0
## (Other) :43
## firstPollenOutOfScreen blankFrames slope
## Min. : 66.0 Min. :181.0 Min. :0.0009956
## 1st Qu.:105.0 1st Qu.:261.0 1st Qu.:0.0085708
## Median :138.0 Median :291.0 Median :0.0118328
## Mean :158.6 Mean :345.6 Mean :0.0143197
## 3rd Qu.:175.0 3rd Qu.:334.0 3rd Qu.:0.0180549
## Max. :517.0 Max. :922.0 Max. :0.0386782
##
## int aperture focus frameRate shutter
## Min. :0.01146 Min. :11.0 1:15:49 Min. : 500 "1/1000":39
## 1st Qu.:0.08344 1st Qu.:16.0 1st Qu.: 500 "1/1500": 9
## Median :0.10673 Median :16.0 Median : 500 "1/2000": 1
## Mean :0.10966 Mean :15.9 Mean : 602
## 3rd Qu.:0.13796 3rd Qu.:16.0 3rd Qu.: 500
## Max. :0.29427 Max. :16.0 Max. :1000
##
## amplitude freq sec plant
## Min. :0.38 Min. :279 Min. :0.1792 big002:49
## 1st Qu.:0.38 1st Qu.:279 1st Qu.:0.1792 sml001: 0
## Median :0.38 Median :279 Median :0.1792
## Mean :0.38 Mean :279 Mean :0.1792
## 3rd Qu.:0.38 3rd Qu.:279 3rd Qu.:0.1792
## Max. :0.38 Max. :279 Max. :0.1792
##
## openTime DateOfExp
## after 3pm previous day: 0 :40
## afternoon : 0 6/26/14: 7
## midday : 0 6/24/14: 2
## morning : 0 6/25/14: 0
## morningBeforeTrial : 0 7/2/14 : 0
## overnight : 7 7/22/14: 0
## NA's :42 (Other): 0
# select rows where daysOpen does not = 1, and distance == 1
d1d1 <- flr[flr$daysOpen != 1 & flr$distance == 1, ]
head(d1d1)
## flowerNum trt daysOpen distance shakeNumber temp humidity
## 18 23 Ambient 3 1 1 24.2 55
## 19 23 Ambient 3 1 2 24.2 55
## 20 24 Ambient 3 1 1 24.3 55
## 21 24 Ambient 3 1 2 24.3 55
## 24 26 Ambient 2 1 1 24.4 55
## 25 26 Ambient 2 1 2 24.4 55
## notes dropFromAnalysis file pxToMmConversion
## 18 petals forward 023_A_3_01_01 06152014 33.53424
## 19 petals forward 023_A_3_01_02 06152014 33.44562
## 20 <NA> 024_A_3_01_01 06152014 33.27383
## 21 <NA> 024_A_3_01_02 06152014 33.31807
## 24 petals forward 026_A_2_01_01 06152014 33.60898
## 25 petals forward 026_A_2_01_02 06152014 33.65996
## firstPollen firstPollenOutOfScreen blankFrames slope int
## 18 110 165 326 0.008866537 0.0596188
## 19 68 180 302 0.001279032 0.0257462
## 20 32 88 171 0.023082250 0.2055303
## 21 31 66 202 0.020776064 0.2012348
## 24 95 136 238 0.031098375 0.1135675
## 25 47 102 251 0.007678617 0.0702158
## aperture focus frameRate shutter amplitude freq sec plant
## 18 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 19 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 20 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 21 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 24 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## 25 16 1:15 500 "1/1000" 0.38 279 0.1792115 big002
## openTime DateOfExp
## 18 <NA>
## 19 <NA>
## 20 <NA>
## 21 <NA>
## 24 <NA>
## 25 <NA>
# cleaning data
hist(flr$slope); rug(flr$slope)
# say we want to drop the points where slope > 0.12
sub1 <- flr[flr$slope < 0.12, ]
hist(sub1$slope); rug(sub1$slope)
And tabulating some data from the data frames
# say we want to calculate a new measurement that is a combination of slope + int
flr$NewStat <- flr$slope + flr$int
head(flr$NewStat)
## [1] 0.04545815 0.08858106 0.10037509 0.14428318 0.23411393 0.18618767
# or this will work
flr$NewStat1 <- with(flr, slope + int)
head(flr$NewStat1)
## [1] 0.04545815 0.08858106 0.10037509 0.14428318 0.23411393 0.18618767
# say we want to change one of the variable types in our data frame
class(flr$daysOpen)
## [1] "integer"
flr$daysOpen <- as.factor(flr$daysOpen)
class(flr$daysOpen)
## [1] "factor"
# or this will work
class(flr$flowerNum)
## [1] "integer"
flr <- within(flr, flowerNum <- as.factor(flowerNum))
class(flr$flowerNum)
## [1] "factor"
# calculate averages by individual
# say we want average glycogen for each individual
# URL <- "http://nature.berkeley.edu/~casterln/crawley/rats.txt"
# rats <- read.csv(URL)
# rats <- read.csv(URL, sep = "\t")
summary(rats)
## Glycogen Treatment Rat Liver
## Min. :125.0 Min. :1 Min. :1.0 Min. :1
## 1st Qu.:135.8 1st Qu.:1 1st Qu.:1.0 1st Qu.:1
## Median :141.0 Median :2 Median :1.5 Median :2
## Mean :142.2 Mean :2 Mean :1.5 Mean :2
## 3rd Qu.:150.0 3rd Qu.:3 3rd Qu.:2.0 3rd Qu.:3
## Max. :162.0 Max. :3 Max. :2.0 Max. :3
tapply(X = rats$Glycogen, FUN = mean, INDEX = rats$Rat)
## 1 2
## 138.8333 145.6111
# calcuate mean by treatment and individual
tapply(X = c(rats$Glycogen), FUN = mean, INDEX = interaction(rats$Rat, rats$Treatment))
## 1.1 2.1 1.2 2.2 1.3 2.3
## 132.5000 148.5000 149.6667 152.3333 134.3333 136.0000
tab1 <- tapply(X = flr$slope, INDEX = flr$trt, FUN = mean)
class(tab1)
## [1] "array"
mode(tab1)
## [1] "numeric"
# tabulate your data (get counts)
xt1 <- xtabs(formula = ~trt, data = flr)
xt1
## trt
## AC AC and dehumidifier
## 39 22
## AC set to 62 overnight Ambient
## 10 198
## dehumidifier heat and humidifier
## 45 28
## heat and humidifier in alcove heat set to 80F overnight
## 32 36
## Humidifier
## 32
# convert to data frame for easy saving
tab1 <- as.data.frame(tab1)
Exercises(4):
file.show("datasets/loblolly.txt")
lob <- read.table("datasets/loblolly.txt", header = TRUE, sep = " ")
* check to see if 311 is in your new dataframe with this command: "311 %in% lob.sm$Seed"
lob.sm <- lob[lob$Seed != 311 & lob$height > 5, ]
hist(lob.sm$height); rug(lob.sm$height)
311 %in% lob.sm$Seed # not there
## [1] FALSE
305 %in% lob.sm$Seed # is there
## [1] TRUE
* Advanced plot the histograms side-by-side on one panel, and make sure the x-axis limits are the same for each.
file.show("datasets/yarn.txt")
yarn <- read.table("datasets/yarn.txt", sep = "\t", header = TRUE)
par(mfrow = c(1,2))
limx <- c(0, 80)
hist(yarn$breaks[yarn$wool == "A"], main = "Wool A", xlim = limx)
hist(yarn$breaks[yarn$wool == "B"], main = "Wool B", xlim = limx)
par(mfrow = c(1,1))
tapply(yarn$breaks, INDEX = yarn$wool, FUN = function(x) c(mean = mean(x), median = median(x), range = range(x)))
## $A
## mean median range1 range2
## 31.03704 26.00000 10.00000 70.00000
##
## $B
## mean median range1 range2
## 25.25926 24.00000 13.00000 44.00000
# Write text files
# can also use write.csv()
write.table(x = tab1, file = "Tab1.csv", sep = ",", col.names = FALSE)
# saving plots
pdf(file = "plot1.pdf", width = 5, height = 4) # open graphics device
# can also use postscript()
plot(x = flr$humidity, y=flr$temp) # make your plot
dev.off() # close the graphics device
## quartz_off_screen
## 2
# or just export with the "Export" button
Exercises(5):
pdf("humInt.pdf", width = 4, height = 3)
with(flr, plot(x = humidity, y = int))
dev.off()
## quartz_off_screen
## 2
pdf("fancyHumInt.pdf", width = 5, height = 4)
with(flr, plot(x = jitter(humidity), y = int,
ylab = "Pollen (Integration Method)",
xlab = "Relative Humidity (%)",
bty = "l",
main = "Pollen released vs. Humidity",
pch = 20,
col = rgb(.9, .5, .3, .9)
))
dev.off()
## quartz_off_screen
## 2
flr.dist <- flr[flr$distance > 9 & !is.na(flr$distance), ]
write.csv(flr.dist, file = "flrDist.csv")
use install.packages(“…”), but you only ned to use this the first time you use the library. After that, you can just use library(“…”)
# we def. want ggplot2
# install.packages("ggplot2") # do this only the first time you use a library
library(ggplot2) # now, this package is ready to use
# you can use any function from this library now, without using the :: notation
# help(package = "ggplot2") # gets help for this new package or library
# note: I use package/library interchangeably -- there's probably a difference, but I don't know it.
# let's try it
ggplot(flr, aes(x= humidity, y = int)) +
geom_point(alpha = 0.8, size = 4, position = position_jitter(width = 0.5, height = 0)) +
theme_bw() +
theme( panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")) +
labs(x = "Relative Humidity", y = "Pollen Release (Integration method)",
title = 'Pollen release vs. Humidity') +
geom_smooth(method = "lm", se = FALSE) # add a line without standard error
Exercises(6):
ddply(flr, “shakeNumber”, summarise, mean.Int = mean(int)) You should get this:
## shakeNumber mean.Int
## 1 1 0.11814448
## 2 2 0.08530651
## 3 7 0.02955185
## 4 8 0.03822761
# install.packages("plyr")
library(plyr)
ddply(flr, "shakeNumber", summarise, mean.Int = mean(int))
## shakeNumber mean.Int
## 1 1 0.11814448
## 2 2 0.08530651
## 3 7 0.02955185
## 4 8 0.03822761
# install.packages("ggmap")
library(ggmap)
#citation(ggmap)
mapImageData1 <- get_map(location = c( lon = -71.114573, lat = 42.379036), zoom = 19, maptype = "satellite")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=42.379036,-71.114573&zoom=19&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
ggmap(mapImageData1)
df1 <- data.frame(person = c("bob", "sue", "jack", "ann"), number1 = c(4,5,6,1), number2 = c(2,4,6,6))
df1
## person number1 number2
## 1 bob 4 2
## 2 sue 5 4
## 3 jack 6 6
## 4 ann 1 6
# we want to plot persons' number1 and number2, but we'd need them to be in the same column
# wrong
ggplot(df1) +
geom_point(aes(x = person, y = number1), color = "red") +
geom_point(aes(x = person, y = number2), color = "blue")
# you can reformat
library(reshape2)
df.melt <- melt(df1, id.vars = "person")
ggplot(df.melt, aes(x = person, y = value)) +
geom_point(aes(color = variable))
## formatting for barcharts
ggplot(df1, aes(x = person, y = number1)) +
geom_bar(stat = "identity")
## or reshape the data frame
df.long <- data.frame(person = unlist(sapply(df1$person,
FUN = function(x)
rep(x, times = df1[x, "number1"]))))
ggplot(df.long, aes(x = person)) +
geom_bar( fill = "red", alpha = 0.6)
# for loops repeat a specific number of times
for(i in 1:10){
print(i + 100)
}
## [1] 101
## [1] 102
## [1] 103
## [1] 104
## [1] 105
## [1] 106
## [1] 107
## [1] 108
## [1] 109
## [1] 110
for(ii in df1$person){
print(rep(ii, times = df1[df1$person == ii, "number1"]))
}
## [1] "bob" "bob" "bob" "bob"
## [1] "sue" "sue" "sue" "sue" "sue"
## [1] "jack" "jack" "jack" "jack" "jack" "jack"
## [1] "ann"
## while loops repeat until something happens
x <- 1
while(x < 10){
print(x + 100)
x <- x + 1
}
## [1] 101
## [1] 102
## [1] 103
## [1] 104
## [1] 105
## [1] 106
## [1] 107
## [1] 108
## [1] 109
## Apply functions are often faster than loops
# for loop
system.time({
newVec <- numeric()
for(ii in 1:50000){
newVec[ii] <- ii + 100
}
})
## user system elapsed
## 4.003 8.521 14.625
head(newVec)
## [1] 101 102 103 104 105 106
## sapply function
system.time({
newVec.sa <- sapply(1:100000, FUN = function(x) x + 100)
})
## user system elapsed
## 0.131 0.004 0.204
## you can speed up a for-loop by pre-allocating space
system.time({
newVec <- numeric(100000)
for(ii in 1:100000){
newVec[ii] <- ii + 100
}
})
## user system elapsed
## 0.200 0.011 0.307